from pdf2image import convert_from_path
import pytesseract

# Location of the Tesseract executable that pytesseract should invoke
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Directory containing the Poppler binaries (passed to convert_from_path below)
poppler_path = r'D:\software\poppler-23.11.0\poppler-23.11.0\Library\bin'  # replace with your own Poppler path

# PDF file to run OCR on
pdf_path = r'd:\test.pdf'  # raw string so the backslash is not treated as an escape

# Convert the PDF into a sequence of page images (rendered at 300 DPI).
try:
    images = convert_from_path(pdf_path, poppler_path=poppler_path, dpi=300)
except Exception as e:
    # Top-level boundary of the script: report the failure and stop.
    # SystemExit with a string argument writes the message to stderr and
    # terminates with exit status 1, so callers can detect the failure.
    # (The bare exit() helper is injected by the `site` module, is not
    # guaranteed to exist, and would have exited with status 0.)
    raise SystemExit(f"读取 PDF 文件失败: {e}")
else:
    # Kept outside the try body so that only the conversion itself is
    # reported as a PDF read failure.
    print(f"成功读取 PDF 文件，共 {len(images)} 页。")

# Run OCR on each page image in order and print the recognized text,
# labelling pages starting from 1.
for page_number, page_image in enumerate(images, start=1):
    # Convert to mode 'L' (single-channel grayscale in Pillow) before OCR.
    grayscale_page = page_image.convert('L')
    # 'chi_sim' selects the Simplified Chinese language data.
    page_text = pytesseract.image_to_string(grayscale_page, lang='chi_sim')
    print(f"Page {page_number}:\n{page_text}\n")
